In [ ]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Cross-Validation


In [ ]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

# Load the iris dataset and unpack its feature matrix and target vector.
iris = load_iris()
X = iris.data
y = iris.target
n_samples = len(X)  # number of rows in the feature matrix

# Show both shapes and then the full target vector, one item per line.
print(X.shape, y.shape, y, sep="\n")

Validation with a training / test split


In [ ]:
# `sklearn.cross_validation` was removed from scikit-learn; the same helper
# lives in `sklearn.model_selection` with an unchanged call signature.
from sklearn.model_selection import train_test_split

# Split the samples into a training part and a held-out test part.
# `random_state=42` is passed so that the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape)
print(X_test.shape)

In [ ]:
# Build a k-nearest-neighbours classifier with its default settings and fit
# it on the training split only.
classifier = KNeighborsClassifier()
classifier.fit(X_train, y_train)

# Score the fitted model on the held-out split; as the last expression of the
# cell, the value is displayed as the cell output.
test_score = classifier.score(X_test, y_test)
test_score

Cross-validation for more robust estimates


In [ ]:
# `sklearn.cross_validation` was removed from scikit-learn; `cross_val_score`
# now lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score

# The number of folds is pinned to 3, which was the implicit default when this
# notebook was written. Newer scikit-learn releases default to 5 folds, which
# would make this cell indistinguishable from the explicit `cv=5` cell below.
scores = cross_val_score(classifier, X, y, cv=3)
print(scores)           # one score per fold
print(np.mean(scores))  # average over the folds

In [ ]:
# Same evaluation with the number of folds given explicitly; the per-fold
# scores are the cell output.
cross_val_score(estimator=classifier, X=X, y=y, cv=5)

Custom cross-validation iterators


In [ ]:
from sklearn.cross_validation import KFold, StratifiedKFold, ShuffleSplit, LeavePLabelOut

In [ ]:
cv = StratifiedKFold(iris.target, n_folds=5)
for train, test in cv:
    print(test)

In [ ]:
def plot_cv(cv, n_samples):
    masks = []
    for train, test in cv:
        mask = np.zeros(n_samples, dtype=bool)
        mask[test] = 1
        masks.append(mask)
    
    plt.matshow(masks)

In [ ]:
cv = StratifiedKFold(y, n_folds=5)
plot_cv(cv, n_samples)

In [ ]:
cv = KFold(n_samples, n_folds=5)
plot_cv(cv, n_samples)

In [ ]:
cv = KFold(n_samples, n_folds=5, shuffle=True)
plot_cv(cv, n_samples)

In [ ]:
cv = KFold(n_samples, n_folds=10)
plot_cv(cv, n_samples)

In [ ]:
cv = ShuffleSplit(n_samples, n_iter=5, test_size=.2)
plot_cv(cv, n_samples)

In [ ]:
cv = ShuffleSplit(n_samples, n_iter=20, test_size=.2)
plot_cv(cv, n_samples)

In [ ]:
cv = ShuffleSplit(n_samples, n_iter=5, test_size=.2)
cross_val_score(classifier, X, y, cv=cv)

In [ ]: